# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START googlegenaisdk_live_conversation_audio_with_audio]

import asyncio
import base64

from google import genai
from google.genai.types import (
    AudioTranscriptionConfig,
    Blob,
    HttpOptions,
    LiveConnectConfig,
    Modality,
)
import numpy as np

from scipy.io import wavfile

# The number of audio frames to send in each chunk.
CHUNK = 4200
# Mono audio.
CHANNELS = 1
# Live API model that accepts and produces native audio.
MODEL = "gemini-live-2.5-flash-preview-native-audio-09-2025"

# The audio sample rate expected by the model.
INPUT_RATE = 16000
# The audio sample rate of the audio generated by the model.
OUTPUT_RATE = 24000

# The sample width for 16-bit audio, which is standard for this type of audio data.
SAMPLE_WIDTH = 2

# Module-level client used by main(); constructed at import time.
# NOTE(review): CHUNK, CHANNELS, INPUT_RATE, and SAMPLE_WIDTH are not
# referenced elsewhere in this file — presumably kept for readers adapting
# the sample to streamed microphone input; confirm before removing.
client = genai.Client(http_options=HttpOptions(api_version="v1beta1"), location="us-central1")


def read_wavefile(filepath: str) -> tuple[str, str]:
    """Load a WAV file and return its PCM payload as (base64 string, mime type).

    The sample rate read from the file header is embedded in the returned
    mime type, e.g. ``audio/pcm;rate=16000``.
    """
    sample_rate, samples = wavfile.read(filepath)
    # Serialize the sample array back into its raw PCM byte representation.
    pcm_bytes = samples.tobytes()
    # Base64-encode, then decode the result into a plain ASCII string.
    encoded = base64.b64encode(pcm_bytes).decode("ascii")
    return encoded, f"audio/pcm;rate={sample_rate}"


def write_wavefile(filepath: str, audio_frames: list[bytes], rate: int) -> None:
    """Writes a list of audio byte frames to a WAV file using scipy."""
    # Concatenate the frames into one contiguous PCM byte stream.
    pcm_stream = b"".join(audio_frames)

    # Interpret the stream as signed 16-bit samples (2 bytes per sample),
    # the layout scipy expects for 16-bit WAV output.
    samples = np.frombuffer(pcm_stream, dtype=np.int16)

    wavfile.write(filepath, rate, samples)
    print(f"Model response saved to {filepath}")


async def main() -> bool:
    """Run one live audio exchange: send a local WAV clip, save the reply.

    Opens a Live API session with audio responses and transcription enabled,
    streams a local audio file to the model, prints input/output transcripts
    as they arrive, and writes the model's audio reply to
    ``example_model_response.wav``. Returns True when the session completes.
    """
    print("Starting the code")

    async with client.aio.live.connect(
        model=MODEL,
        config=LiveConnectConfig(
            # Set Model responses to be in Audio
            response_modalities=[Modality.AUDIO],
            # To generate transcript for input audio
            input_audio_transcription=AudioTranscriptionConfig(),
            # To generate transcript for output audio
            output_audio_transcription=AudioTranscriptionConfig(),
        ),
    ) as session:

        async def send() -> None:
            # using local file as an example for live audio input
            wav_file_path = "hello_gemini_are_you_there.wav"
            base64_data, mime_type = read_wavefile(wav_file_path)
            audio_bytes = base64.b64decode(base64_data)
            await session.send_realtime_input(media=Blob(data=audio_bytes, mime_type=mime_type))

        async def receive() -> None:
            audio_frames = []

            async for message in session.receive():
                # Not every server message carries content (e.g. setup acks),
                # so guard before dereferencing server_content fields.
                if message.server_content is None:
                    continue
                if message.server_content.input_transcription:
                    print(message.server_content.model_dump(mode="json", exclude_none=True))
                if message.server_content.output_transcription:
                    print(message.server_content.model_dump(mode="json", exclude_none=True))
                if message.server_content.model_turn:
                    for part in message.server_content.model_turn.parts:
                        # A part may carry text rather than audio; check that
                        # inline_data exists before reading .data to avoid an
                        # AttributeError on non-audio parts.
                        if part.inline_data and part.inline_data.data:
                            audio_frames.append(part.inline_data.data)

            if audio_frames:
                write_wavefile(
                    "example_model_response.wav",
                    audio_frames,
                    OUTPUT_RATE,
                )

        send_task = asyncio.create_task(send())
        receive_task = asyncio.create_task(receive())
        await asyncio.gather(send_task, receive_task)
        # Example response:
        #     gemini-2.0-flash-live-preview-04-09
        #     {'input_transcription': {'text': 'Hello.'}}
        #     {'output_transcription': {}}
        #     {'output_transcription': {'text': 'Hi'}}
        #     {'output_transcription': {' there. What can I do for you today?'}}
        #     {'output_transcription': {'finished': True}}
        #     Model response saved to example_model_response.wav

# [END googlegenaisdk_live_conversation_audio_with_audio]
    return True

if __name__ == "__main__":
    # Script entry point: drive the async session to completion.
    asyncio.run(main())
